package com.writing.util;

import lombok.extern.slf4j.Slf4j;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

/**
 * Utility for guessing the character encoding of raw file bytes.
 *
 * <p>Detection is heuristic and proceeds in three stages: a byte order mark wins
 * outright; otherwise NUL-free bytes that are strictly well-formed UTF-8 are reported
 * as UTF-8; otherwise every candidate in {@code COMMON_ENCODINGS} is decoded and scored,
 * and the highest-scoring candidate is returned.
 */
@Slf4j
public class EncodingDetector {

    // Candidate encodings in priority order; on equal scores the earlier entry wins
    // because a later candidate must score strictly higher to replace it.
    private static final List<String> COMMON_ENCODINGS = Arrays.asList(
        "UTF-8", "GBK", "GB2312", "UTF-16LE", "UTF-16BE", "ISO-8859-1"
    );

    // Sentence punctuation whose presence counts as evidence of readable text.
    private static final String CHINESE_PUNCTUATION = "。！？；，";
    private static final String ENGLISH_PUNCTUATION = ".!?;,";

    /**
     * Detects the most likely encoding of the given file content.
     *
     * @param bytes raw file bytes; may be {@code null} or empty
     * @return the detected encoding name; {@code "UTF-8"} for {@code null}/empty input
     *         or when no candidate scores above zero
     */
    public static String detectEncoding(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return "UTF-8";
        }

        // 1. A byte order mark is authoritative.
        String bomEncoding = detectBOM(bytes);
        if (bomEncoding != null) {
            log.info("通过BOM检测到编码: {}", bomEncoding);
            return bomEncoding;
        }

        // 2. Strict UTF-8 validation. Score-based comparison alone is unreliable here:
        //    UTF-8 encoded Chinese text frequently also decodes as plausible-looking GBK,
        //    and GBK additionally receives the Chinese-text bonus, so it could outrank
        //    the correct answer.
        if (isStrictUtf8(bytes)) {
            log.info("通过严格UTF-8校验检测到编码: {}", "UTF-8");
            return "UTF-8";
        }

        // 3. Decode with every candidate and keep the best score.
        String bestEncoding = "UTF-8";
        double bestScore = 0;

        for (String encoding : COMMON_ENCODINGS) {
            Charset charset;
            try {
                charset = Charset.forName(encoding);
            } catch (IllegalArgumentException e) {
                // The charset is not available in this runtime; skip the candidate.
                log.debug("编码 {} 解析失败: {}", encoding, e.getMessage());
                continue;
            }

            // Decoding never throws: malformed input becomes U+FFFD, which is penalised
            // by the scoring below.
            String content = new String(bytes, charset);
            double score = calculateEncodingScore(content, bytes, charset, encoding);

            log.debug("编码 {} 的评分: {}", encoding, score);

            if (score > bestScore) {
                bestScore = score;
                bestEncoding = encoding;
            }
        }

        log.info("选择编码: {} (评分: {})", bestEncoding, bestScore);
        return bestEncoding;
    }

    /**
     * Checks whether the bytes are strictly well-formed UTF-8 and contain no NUL byte.
     *
     * <p>Input containing NUL bytes is rejected so that BOM-less UTF-16 text (where ASCII
     * characters are padded with zero bytes) still goes through score-based detection.
     * Pure ASCII input is accepted, since ASCII is a subset of UTF-8.
     *
     * @param bytes non-null raw bytes
     * @return {@code true} if the bytes decode as UTF-8 without any malformed sequence
     */
    private static boolean isStrictUtf8(byte[] bytes) {
        for (byte b : bytes) {
            if (b == 0) {
                return false;
            }
        }

        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            decoder.decode(ByteBuffer.wrap(bytes));
            return true;
        } catch (CharacterCodingException e) {
            // Malformed sequence: not UTF-8 (or truncated); fall back to scoring.
            return false;
        }
    }

    /**
     * Detects a BOM (byte order mark) at the start of the data.
     *
     * @param bytes non-null raw bytes
     * @return {@code "UTF-8"}, {@code "UTF-16BE"} or {@code "UTF-16LE"} when the matching
     *         BOM is present, otherwise {@code null}
     */
    private static String detectBOM(byte[] bytes) {
        if (bytes.length >= 3) {
            // UTF-8 BOM: EF BB BF
            if (bytes[0] == (byte) 0xEF && bytes[1] == (byte) 0xBB && bytes[2] == (byte) 0xBF) {
                return "UTF-8";
            }
        }

        if (bytes.length >= 2) {
            // UTF-16 BE BOM: FE FF
            if (bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            // UTF-16 LE BOM: FF FE
            if (bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
        }

        return null;
    }

    /**
     * Computes the overall plausibility score of one decoding attempt.
     *
     * @param content       text obtained by decoding {@code originalBytes} with {@code charset}
     * @param originalBytes the raw input bytes
     * @param charset       the charset used for decoding
     * @param encoding      the candidate's name as listed in {@code COMMON_ENCODINGS}
     * @return the combined score, never negative
     */
    private static double calculateEncodingScore(String content, byte[] originalBytes,
                                                 Charset charset, String encoding) {
        double score = 0;

        // 1. Replacement characters mark undecodable input: 10 points off for each.
        long replacementChars = content.chars().filter(ch -> ch == 0xFFFD).count();
        if (replacementChars > 0) {
            score -= replacementChars * 10;
        }

        // 2. Plausibility of the character distribution.
        score += calculateCharacterDistributionScore(content);

        // 3. Round-trip consistency (does re-encoding reproduce the input?).
        score += calculateEncodingConsistencyScore(content, originalBytes, charset);

        // 4. Text-level plausibility.
        score += calculateTextReasonabilityScore(content);

        // 5. Encoding-specific bonus.
        score += getEncodingSpecificBonus(content, encoding);

        return Math.max(0, score);
    }

    /**
     * Scores how text-like the decoded characters are.
     *
     * <p>Letters, digits, punctuation and whitespace count as valid; ISO control
     * characters that are not whitespace are penalised.
     *
     * @param content decoded text
     * @return {@code validRatio * 100 - controlRatio * 200}, plus 20 when more than 10%
     *         of the characters are Chinese; {@code 0} for empty content
     */
    private static double calculateCharacterDistributionScore(String content) {
        if (content.isEmpty()) {
            return 0;
        }

        int totalChars = content.length();
        int validChars = 0;
        int chineseChars = 0;
        int controlChars = 0;

        for (int i = 0; i < totalChars; i++) {
            char ch = content.charAt(i);

            if (Character.isLetter(ch)) {
                validChars++;
                if (isChinese(ch)) {
                    chineseChars++;
                }
            } else if (Character.isDigit(ch) || isPunctuation(ch) || Character.isWhitespace(ch)) {
                validChars++;
            } else if (Character.isISOControl(ch) && ch != '\n' && ch != '\r' && ch != '\t') {
                controlChars++;
            }
        }

        double validRatio = (double) validChars / totalChars;
        double controlRatio = (double) controlChars / totalChars;

        double score = validRatio * 100; // share of valid characters
        score -= controlRatio * 200;     // control-character penalty

        // A noticeable share of Chinese characters earns a bonus. This applies to every
        // candidate; the GBK/GB2312-specific bonus is added in getEncodingSpecificBonus.
        if (chineseChars > totalChars * 0.1) {
            score += 20;
        }

        return score;
    }

    /**
     * Scores how closely re-encoding the decoded text reproduces the original bytes.
     *
     * @param content       decoded text
     * @param originalBytes the raw input bytes
     * @param charset       the charset used for decoding and re-encoding
     * @return a value from 0 to 50: the fraction of positions with identical bytes,
     *         relative to the longer array, times 50
     */
    private static double calculateEncodingConsistencyScore(String content, byte[] originalBytes,
                                                            Charset charset) {
        byte[] reEncodedBytes = content.getBytes(charset);

        int longerLength = Math.max(originalBytes.length, reEncodedBytes.length);
        if (longerLength == 0) {
            return 0;
        }

        // Position-wise byte similarity over the common prefix length.
        int minLength = Math.min(originalBytes.length, reEncodedBytes.length);
        int matchingBytes = 0;
        for (int i = 0; i < minLength; i++) {
            if (originalBytes[i] == reEncodedBytes[i]) {
                matchingBytes++;
            }
        }

        double similarity = (double) matchingBytes / longerLength;
        return similarity * 50; // at most 50 points
    }

    /**
     * Scores text-level plausibility: chapter markers, sentence punctuation and line
     * structure.
     *
     * @param content decoded text
     * @return the accumulated score; {@code 0} for empty content
     */
    private static double calculateTextReasonabilityScore(String content) {
        if (content.isEmpty()) {
            return 0;
        }

        double score = 0;

        // Chapter/section markers.
        if (content.contains("第") && (content.contains("章") || content.contains("节"))) {
            score += 30;
        }

        // Punctuation is searched for anywhere in the text. A String.matches(".*[...].*")
        // check would fail for every multi-line text, because '.' does not match line
        // terminators by default.
        if (containsAny(content, CHINESE_PUNCTUATION)) {
            score += 20;
        }

        if (containsAny(content, ENGLISH_PUNCTUATION)) {
            score += 10;
        }

        // Line structure: share of lines that are non-blank and shorter than 1000 chars.
        String[] lines = content.split("\n");
        if (lines.length > 1) {
            int reasonableLines = 0;
            for (String line : lines) {
                int trimmedLength = line.trim().length();
                if (trimmedLength > 0 && trimmedLength < 1000) {
                    reasonableLines++;
                }
            }
            double lineRatio = (double) reasonableLines / lines.length;
            score += lineRatio * 20;
        }

        return score;
    }

    /**
     * Returns whether {@code content} contains at least one character of {@code candidates}.
     */
    private static boolean containsAny(String content, String candidates) {
        return content.chars().anyMatch(ch -> candidates.indexOf(ch) >= 0);
    }

    /**
     * Computes the bonus tied to the specific candidate encoding.
     *
     * @param content  decoded text
     * @param encoding candidate encoding name
     * @return 30 for GBK/GB2312 when more than 10% of the characters are Chinese, plus 20
     *         for UTF-8 when more than half are ASCII letters; {@code 0} for empty content
     */
    private static double getEncodingSpecificBonus(String content, String encoding) {
        if (content.isEmpty()) {
            // Avoid 0/0 ratios (NaN) below.
            return 0;
        }

        double bonus = 0;
        int totalChars = content.length();

        // Mostly Chinese text favours the Chinese encodings.
        long chineseCharCount = content.chars().filter(EncodingDetector::isChinese).count();
        double chineseRatio = (double) chineseCharCount / totalChars;
        if (chineseRatio > 0.1 && ("GBK".equals(encoding) || "GB2312".equals(encoding))) {
            bonus += 30;
        }

        // Mostly ASCII letters favours UTF-8.
        long englishCharCount = content.chars()
                .filter(ch -> (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
                .count();
        double englishRatio = (double) englishCharCount / totalChars;
        if (englishRatio > 0.5 && "UTF-8".equals(encoding)) {
            bonus += 20;
        }

        return bonus;
    }

    /**
     * Returns whether the code point lies in one of the CJK ideograph blocks checked here.
     */
    private static boolean isChinese(int ch) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
        return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
               block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
               block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B ||
               block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS ||
               block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
    }

    /**
     * Returns whether the character belongs to one of the Unicode punctuation categories
     * (other, start, end, initial-quote, final-quote or dash punctuation).
     */
    private static boolean isPunctuation(char ch) {
        switch (Character.getType(ch)) {
            case Character.OTHER_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Checks whether the named encoding is supported by this runtime.
     *
     * @param encoding charset name; may be {@code null}
     * @return {@code true} if the charset is supported; {@code false} if it is not, or if
     *         the name is {@code null} or not a legal charset name
     */
    public static boolean isEncodingSupported(String encoding) {
        if (encoding == null) {
            return false;
        }
        try {
            return Charset.isSupported(encoding);
        } catch (IllegalArgumentException e) {
            // Illegal charset name (IllegalCharsetNameException is a subclass).
            return false;
        }
    }
}
